#plotting format
plotformat = theme(plot.title = element_text(face="bold",size = 17,hjust = 0.5),axis.title = element_text(face = "bold",size =15), axis.text.x = element_text(size=12), axis.title.y=element_text(size=14))+theme_bw()

theme_facet = function(base_size = 14, base_family = "Helvetica") {
  # Starts with theme_grey and then modify some parts
  theme_bw(base_size = base_size, base_family = base_family) %+replace%
    theme(
      strip.background = element_blank(),
      strip.text.x = element_text(size = 10),
      strip.text.y = element_text(size = 10),
      axis.text.x = element_blank(),
      axis.text.y = element_text(size=12,hjust=1),
      axis.ticks =  element_blank(), #element_line(colour = "black"), 
      axis.title.x= element_text(size=12),
      axis.title.y= element_text(size=12,angle=90),
      panel.background = element_blank(), 
      panel.border =element_blank(), 
      panel.grid.minor = element_blank(), 
      panel.spacing = unit(0.5, "lines"), 
      plot.background = element_blank(), 
      plot.margin = unit(c(0.3,  0.3, 0.3, 0.3), "lines"),
      axis.line.x = element_line(color="black", size = 0.5),
      axis.line.y = element_line(color="black", size = 0.5)
    )
}
#color
Features = c('#deebf7','#9ecae1','#6baed6','#4292c6','#08519c','#08306b',
             '#fee6ce','#fdae6b','#fd8d3c','#f16913','#a63603','#7f2704',
             '#f0f0f0','#bdbdbd','#969696','#737373','#252525','#000000',
             '#efedf5','#bcbddc','#9e9ac8','#807dba','#54278f','#3f007d')
time = '2500';
pilot_version = "70levels_V323";
dataPath = '/Volumes/Macintosh HD/Users/Pam_sf_wang/Documents/Perceptual_learning_project/Calibration/Calibration_analysis/Prolific_analysis/70levels/';


data_new_fname = paste0("Prolific_",pilot_version,"_",time,".csv")
demographic_new_fname = paste0("Prolific_",pilot_version,"_demographic",time,".csv")
postsurvey_new_fname = paste0("Prolific_",pilot_version,"_postsurvey",time,".csv")

postsurvey_dp_fname = paste0("Prolific_",pilot_version,"_survey_dp",time,".csv")

d_p_thre = 10/100
trial_thre = 95/100
age_thre = 35

load data

totalData = read.csv(paste0(dataPath,data_new_fname),header = T)
totalData = totalData[,-1]
demographic_information = read.csv(paste0(dataPath,demographic_new_fname),header = T)
postsurvey = read.csv(paste0(dataPath,postsurvey_new_fname),header = T)

Identify and remove outliers

summarize = dplyr::summarize
#check basic performance -- remove non-responding subjects
temp = totalData %>% group_by(subID,feature_index) %>% summarize(num_noresponses= sum(keys==-1), trialNum = length(subID), no_resp = sum(keys==-1)/trialNum)
sprintf("total trial number: %i; Number of subjects %i",temp$trialNum[1],dim(temp)[1])
## [1] "total trial number: 231; Number of subjects 81"
head(temp)
## # A tibble: 6 x 5
## # Groups:   subID [6]
##   subID   feature_index num_noresponses trialNum no_resp
##   <fct>           <int>           <int>    <int>   <dbl>
## 1 ""                  2              11      231  0.0476
## 2 10438bd             3               0      231  0     
## 3 1050095             2              24      231  0.104 
## 4 1070d1e             1               0      231  0     
## 5 10f4080             3               0      231  0     
## 6 1106b05             2               4      231  0.0173
###quantile: sample quantiles corresponding to the given probabilities
summary(temp$no_resp)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.000000 0.008658 0.045722 0.025974 1.000000
noresp_thre = quantile(temp$no_resp,prob = trial_thre)
sprintf("the last 5percent of non-resp rate (threshold for excluding subjects): %f",noresp_thre)
## [1] "the last 5percent of non-resp rate (threshold for excluding subjects): 0.129870"
ggplot(temp, aes(y= no_resp, x= subID,color = as.factor(feature_index)))+
  geom_point()+
  geom_hline(yintercept=noresp_thre, linetype="dashed", color = "red", size=0.5)+
  labs(title="No responses", x ="subjuect ID", y = "number of no responses")+
  plotformat

ggplot(temp, aes(y= no_resp, x= feature_index,color = as.factor(feature_index)))+
  geom_boxplot(fill = "white",lwd = 1)+
  geom_jitter(width=0.2,alpha = 0.5)+
  geom_hline(yintercept=noresp_thre, linetype="dashed", color = "red", size=0.5)+
  labs(title="No responses", x ="subjuect ID", y = "number of no responses")+
  plotformat
## Warning: Removed 1 rows containing missing values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).

#no response subjects
removeSub = temp$subID[temp$no_resp>=noresp_thre]
totalData = totalData%>%filter(!subID %in% removeSub)
totalData = mutate(totalData, subID = as.factor(subID))
demographic_information = demographic_information%>%filter(!SubID %in%removeSub)


sprintf("remove no response subjects: %i; remain subjects %i",length(removeSub),length(unique(totalData$subID)))
## [1] "remove no response subjects: 5; remain subjects 76"
removeSub
## [1] 13599c4 13599c4 1612848 1c24bad 1d9ad2f
## 80 Levels:  10438bd 1050095 1070d1e 10f4080 1106b05 1119c1a ... c223b3d

Demographic information

genderCount = table(demographic_information$Gender)
EthnicityCount = table(demographic_information$Ethnicity)
RaceCount = table(demographic_information$Race)
genderCount
## 
##  F  M 
## 34 42
EthnicityCount
## 
##  HL NHL 
##   3  69
RaceCount
## 
##  A AA AI  M  O  W 
##  8  3  1  5  1 56
summary(demographic_information$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   25.00   29.00   28.17   32.00   36.00
  ggplot(demographic_information, aes(demographic_information$Age)) + 
    geom_histogram()+
    geom_vline(xintercept=median(demographic_information$Age), linetype="dashed", color = "red", size=0.5)+
    #geom_vline(xintercept=mean(demographic_information$Age), linetype="dashed", color = "blue", size=0.5)+
    labs(title = "Age Distribution",x = "age", y = "counts")+
    plotformat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Restrict age

#select right age
remainSub = demographic_information$SubID[is.na(demographic_information$Age) | demographic_information$Age<=age_thre]
demographic_information_remain = filter(demographic_information,SubID%in%remainSub)
  
summary(demographic_information_remain$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   25.00   29.00   28.07   32.00   35.00
ggplot(demographic_information_remain, aes(Age)) + 
  geom_histogram()+
  geom_vline(xintercept=median(demographic_information_remain$Age), linetype="dashed", color = "red", size=0.5)+
  #geom_vline(xintercept=mean(demographic_information$Age), linetype="dashed", color = "blue", size=0.5)+
  labs(title = "Age Distribution",x = "age", y = "counts")+
  plotformat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

genderCount = table(demographic_information_remain$Gender)
EthnicityCount = table(demographic_information_remain$Ethnicity)
RaceCount = table(demographic_information_remain$Race)
genderCount
## 
##  F  M 
## 33 42
EthnicityCount
## 
##  HL NHL 
##   3  68
RaceCount
## 
##  A AA AI  M  O  W 
##  7  3  1  5  1 56
#remove subjects who are outside of the age range
removeSubAge = demographic_information$SubID[!is.na(demographic_information$Age) & demographic_information$Age>age_thre]
totalData = totalData %>%filter(subID %in%remainSub)
sprintf("remove over aged subejcts %i; remain subjects: %i", length(removeSubAge),length(unique(totalData$subID)))
## [1] "remove over aged subejcts 1; remain subjects: 75"

Add conditions

summarize = dplyr::summarize
#trial conditions: different:D; same:S (same as keys)
totalData$SameDiff = NA;
totalData$SameDiff[totalData$Fimg==totalData$Simg]="S";
totalData$SameDiff[totalData$Fimg!=totalData$Simg]="D";
  
totalData$feature_index = as.factor(totalData$feature_index)
  
#trial conditions: level
totalData$level_diff = as.factor(abs(totalData$Fimg-totalData$Simg))

#pair identity
temp = data.frame()
for (irow in 1:dim(totalData)[1]){
  tempPair = sort(c(totalData$Fimg[irow],totalData$Simg[irow]))
  tempName = paste(tempPair,collapse = "_")
  tempName = paste(tempName,totalData$feature_index[irow],sep="_F")
  totalData$PairIdentity[irow]=tempName
  rm(tempName)
}

Task structure

temptrialNumbers=totalData %>%group_by(subID,feature_index,SameDiff)%>%summarize(num_same = length(SameDiff))
head(temptrialNumbers)
## # A tibble: 6 x 4
## # Groups:   subID, feature_index [3]
##   subID   feature_index SameDiff num_same
##   <fct>   <fct>         <chr>       <int>
## 1 ""      2             D             105
## 2 ""      2             S             126
## 3 10438bd 3             D             105
## 4 10438bd 3             S             126
## 5 1050095 2             D             105
## 6 1050095 2             S             126
temptrialNumbers %>%group_by(feature_index)%>%summarize(num_sub = length(subID)/2)
## # A tibble: 3 x 2
##   feature_index num_sub
##   <fct>           <dbl>
## 1 1                  23
## 2 2                  25
## 3 3                  27

Behavior analysis

summarize = dplyr::summarize
#over all performance
basic = totalData %>% filter(keys>=0, run==3)%>% group_by(subID)%>% summarize(
  hit = sum(keys==1&SameDiff=="S"), 
  fa= sum(keys==1&SameDiff=="D"), 
  cr = sum(keys==0&SameDiff=="D"),
  miss = sum(keys==0&SameDiff=="S"),
  HitRate =sum(keys==1&SameDiff=="S")/sum(SameDiff=="S"), 
  FARate = sum(keys==1&SameDiff=="D")/sum(SameDiff=="D"), 
  CRRate = sum(keys==0&SameDiff=="D")/sum(SameDiff=="D"),
  MissRate = sum(keys==0&SameDiff=="S")/sum(SameDiff=="S"), 
  adjFARate = 1/(2*sum(SameDiff=="D")),
  adjHitRate = 1-(1/(2*sum(SameDiff=="S"))))

basic$FARate[basic$FARate==0]=basic$adjFARate[basic$FARate==0]
basic$HitRate[basic$HitRate[!is.na(basic$HitRate)]==1]=basic$adjHitRate[basic$HitRate[!is.na(basic$HitRate)]==1]

row_index = basic$CRRate==1 & basic$MissRate==1
basic$FARate[row_index]=NA
basic$HitRate[row_index]=NA
basic$adjFARate[row_index]=NA
basic$adjHitRate[row_index]=NA

row_index = basic$CRRate==0 & basic$MissRate==0
basic$FARate[row_index]=NA
basic$HitRate[row_index]=NA
basic$adjFARate[row_index]=NA
basic$adjHitRate[row_index]=NA

basic = mutate(basic, d_p = qnorm(HitRate)-qnorm(FARate))

#Plotting
basic_longform = gather(basic, response_type, number_resp, HitRate,FARate,CRRate,MissRate)
head(basic_longform)
## # A tibble: 6 x 10
##   subID   hit    fa    cr  miss adjFARate adjHitRate   d_p response_type
##   <fct> <int> <int> <int> <int>     <dbl>      <dbl> <dbl> <chr>        
## 1 ""       28     9    24     9    0.0152      0.986  1.30 HitRate      
## 2 1043…    42     0    35     0    0.0143      0.988  4.45 HitRate      
## 3 1050…    31    12    17     7    0.0172      0.987  1.12 HitRate      
## 4 1070…    41     2    33     1    0.0143      0.988  3.56 HitRate      
## 5 10f4…    41     1    34     1    0.0143      0.988  3.88 HitRate      
## 6 1106…    41     1    33     1    0.0147      0.988  3.87 HitRate      
## # ... with 1 more variable: number_resp <dbl>
ggplot(basic_longform, aes(x = response_type, y = number_resp, color = response_type))+
  geom_boxplot(fill = "white",lwd = 1)+
  geom_jitter(width=0.2,alpha = 0.5)+
  labs(title="Proportaion response type", x = "response type", y = "proportion")+
  plotformat
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing missing values (geom_point).

#d-p
ggplot(basic, aes(x = subID, y = d_p, color = as.factor(subID)))+
  geom_point(size=3)+
  labs(title="d prime", x = "subject", y = "d prime")+
  plotformat
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(basic,aes(d_p))+
  geom_histogram()+
  labs(title="d prime histogram", x = "d prime", y = "counts")+
  plotformat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

#bad d_p

check subjects with low d-p & save survey data

removedp_thre = quantile(basic$d_p[!is.na(basic$d_p)],prob = d_p_thre)
removeSubdp = basic$subID[basic$d_p<removedp_thre | is.na(basic$d_p)]
sprintf("low d-prime %f. subjects: %i",removedp_thre,length(removeSubdp))
## [1] "low d-prime 0.722628. subjects: 9"
remainSubdp = basic$subID[basic$d_p>d_p_thre]
sprintf("pass d-prim threshould: %i", length(remainSubdp))
## [1] "pass d-prim threshould: 72"
#sprintf("remove d prime lower than %f. Sequence 01 remain subjects: %i; sequence 02 remain subjects: %i",d_p_thre, #length(remain_dp_s1),length(remain_dp_s2))

for (irow in 1:dim(postsurvey)[1]){
  for (irow_basic in 1:dim(basic)[1]){
    if (postsurvey$SubID[irow]==basic$subID[irow_basic]){
      postsurvey$basic_id[irow] = as.character(basic$subID[irow_basic])
      postsurvey$d_p[irow] = basic$d_p[irow_basic]
      postsurvey$feature[irow] = unique(totalData$feature_index[totalData$subID==postsurvey$SubID[irow]])
    }
  }
}

write.csv(postsurvey, file = paste0(postsurvey_dp_fname,".csv"))

##remove low dp 3rd run
num_lowdp = totalData %>% filter(subID %in% as.character(removeSubdp)) %>% group_by(feature_index) %>% summarize(num_trial = length(unique(subID)))
num_lowdp
## # A tibble: 3 x 2
##   feature_index num_trial
##   <fct>             <int>
## 1 1                     2
## 2 2                     4
## 3 3                     3
totalData = totalData%>% filter(!subID %in% removeSubdp)

by features – within Features

summarize = dplyr::summarize
##########
basicFeature = totalData  %>%group_by(subID,feature_index)%>% summarize(
            hit = sum(keys==1&SameDiff=="S"), 
            fa= sum(keys==1&SameDiff=="D"), 
            cr = sum(keys==0&SameDiff=="D"),
            miss = sum(keys==0&SameDiff=="S"),
            HitRate =sum(keys==1&SameDiff=="S")/sum(SameDiff=="S"), 
            FARate = sum(keys==1&SameDiff=="D")/sum(SameDiff=="D"), 
            CRRate = sum(keys==0&SameDiff=="D")/sum(SameDiff=="D"),
            MissRate = sum(keys==0&SameDiff=="S")/sum(SameDiff=="S"), 
            adjFARate = 1/(2*sum(SameDiff=="D")),
            adjHitRate = 1-(1/(2*sum(SameDiff=="S"))))

basicFeature$FARate[basicFeature$FARate==0]=basicFeature$adjFARate[basicFeature$FARate==0]
basicFeature$HitRate[basicFeature$HitRate[!is.na(basicFeature$HitRate)]==1]=basicFeature$adjHitRate[basicFeature$HitRate[!is.na(basicFeature$HitRate)]==1]

row_index = basicFeature$CRRate==1 & basicFeature$MissRate==1
basicFeature$FARate[row_index]=NA
basicFeature$HitRate[row_index]=NA
basicFeature$adjFARate[row_index]=NA
basicFeature$adjHitRate[row_index]=NA

row_index = basicFeature$CRRate==0 & basicFeature$MissRate==0
basicFeature$FARate[row_index]=NA
basicFeature$HitRate[row_index]=NA
basicFeature$adjFARate[row_index]=NA
basicFeature$adjHitRate[row_index]=NA

basicFeature = mutate(basicFeature, d_p = qnorm(HitRate)-qnorm(FARate))

###
basicFeature_longform = gather(basicFeature, response_type, number_resp, HitRate,FARate,CRRate,MissRate)
head(basicFeature_longform)
## # A tibble: 6 x 11
## # Groups:   subID [6]
##   subID feature_index   hit    fa    cr  miss adjFARate adjHitRate   d_p
##   <fct> <fct>         <int> <int> <int> <int>     <dbl>      <dbl> <dbl>
## 1 ""    2               100    15    86    19   0.00476      0.996  1.89
## 2 1043… 3               117    13    92     9   0.00476      0.996  2.62
## 3 1050… 2                91    31    62    23   0.00476      0.996  1.13
## 4 1070… 1               123    21    84     3   0.00476      0.996  2.82
## 5 10f4… 3               124     5   100     2   0.00476      0.996  3.82
## 6 1106… 2               121    12    90     4   0.00476      0.996  2.96
## # ... with 2 more variables: response_type <chr>, number_resp <dbl>
ggplot(basicFeature_longform, aes(x = response_type, y = number_resp, color = response_type))+
  geom_boxplot(fill = "white",lwd = 1)+
  geom_jitter(width=0.2,alpha = 0.5)+
  facet_wrap(~feature_index)+
  labs(title="Proportaion response type by feature", x = "response type", y = "proportion")+
  theme_facet()

ggplot(basicFeature, aes(x = feature_index, y = d_p, group = feature_index, color = feature_index))+
  geom_boxplot(fill = "white",lwd = 1)+
  geom_jitter(width=0.2,alpha = 0.5)+
  plotformat

#ggplot(basicFeature, aes(x = subID, y = d_p, group = feature_index, color = feature_index))+
#  geom_point(size=3)+
#  plotformat

stats

basicFeature$feature_index[is.infinite(basicFeature$d_p)]
## factor(0)
## Levels: 1 2 3
basicFeature$subID[is.infinite(basicFeature$d_p)]
## factor(0)
## 80 Levels:  10438bd 1050095 1070d1e 10f4080 1106b05 1119c1a ... c223b3d
feature_anova = aov(d_p~feature_index,basicFeature)
summary(feature_anova)
##               Df Sum Sq Mean Sq F value Pr(>F)
## feature_index  2   3.11  1.5530    1.94  0.152
## Residuals     63  50.44  0.8006

by features by runs – within Features

summarize = dplyr::summarize
##########
basicFeature = totalData  %>%group_by(subID,feature_index,run)%>% summarize(
            hit = sum(keys==1&SameDiff=="S"), 
            fa= sum(keys==1&SameDiff=="D"), 
            cr = sum(keys==0&SameDiff=="D"),
            miss = sum(keys==0&SameDiff=="S"),
            HitRate =sum(keys==1&SameDiff=="S")/sum(SameDiff=="S"), 
            FARate = sum(keys==1&SameDiff=="D")/sum(SameDiff=="D"), 
            CRRate = sum(keys==0&SameDiff=="D")/sum(SameDiff=="D"),
            MissRate = sum(keys==0&SameDiff=="S")/sum(SameDiff=="S"), 
            adjFARate = 1/(2*sum(SameDiff=="D")),
            adjHitRate = 1-(1/(2*sum(SameDiff=="S"))))

basicFeature$FARate[basicFeature$FARate==0]=basicFeature$adjFARate[basicFeature$FARate==0]
basicFeature$HitRate[basicFeature$HitRate[!is.na(basicFeature$HitRate)]==1]=basicFeature$adjHitRate[basicFeature$HitRate[!is.na(basicFeature$HitRate)]==1]

row_index = basicFeature$CRRate==1 & basicFeature$MissRate==1
basicFeature$FARate[row_index]=NA
basicFeature$HitRate[row_index]=NA
basicFeature$adjFARate[row_index]=NA
basicFeature$adjHitRate[row_index]=NA

row_index = basicFeature$CRRate==0 & basicFeature$MissRate==0
basicFeature$FARate[row_index]=NA
basicFeature$HitRate[row_index]=NA
basicFeature$adjFARate[row_index]=NA
basicFeature$adjHitRate[row_index]=NA

basicFeature = mutate(basicFeature, d_p = qnorm(HitRate)-qnorm(FARate))
###
ggplot(basicFeature, aes(x = feature_index, y = d_p, group = feature_index, color = feature_index))+
  geom_boxplot(fill = "white",lwd = 1)+
  geom_jitter(width=0.2,alpha = 0.5)+
  facet_wrap(~run)+
  plotformat

#ggplot(basicFeature, aes(x = subID, y = d_p, group = feature_index, color = feature_index))+
#  geom_point(size=3)+
#  plotformat

stats

basicFeature_run3 = basicFeature %>% filter(run==3, !is.infinite(d_p))
anovaFeature3rdRun = aov(d_p~feature_index,basicFeature_run3)
summary(anovaFeature3rdRun)
##               Df Sum Sq Mean Sq F value Pr(>F)
## feature_index  2   2.28   1.139   1.044  0.358
## Residuals     63  68.79   1.092
summarize = dplyr::summarize
compareFeature3rdRun = basicFeature  %>%filter(run==3,!is.infinite(d_p)) %>% group_by(feature_index) %>%summarize(meanD_p = mean(d_p),std =sd(d_p),med = median(d_p) )
compareFeature3rdRun
## # A tibble: 3 x 4
##   feature_index meanD_p   std   med
##   <fct>           <dbl> <dbl> <dbl>
## 1 1                3.31 0.838  3.35
## 2 2                2.88 1.25   2.87
## 3 3                3.24 1.01   3.53
#top 10 subjects
topdp = quantile(basicFeature$d_p,90/100)
tempBasicFeature = basicFeature %>% filter(run==3)
top10 = tempBasicFeature %>%filter(d_p>=topdp)
top10
## # A tibble: 16 x 14
## # Groups:   subID, feature_index [16]
##    subID feature_index   run   hit    fa    cr  miss HitRate FARate CRRate
##    <fct> <fct>         <int> <int> <int> <int> <int>   <dbl>  <dbl>  <dbl>
##  1 1043… 3                 3    42     0    35     0   0.988 0.0143  1    
##  2 1545… 1                 3    42     0    35     0   0.988 0.0143  1    
##  3 16c2… 3                 3    42     1    34     0   0.988 0.0286  0.971
##  4 174f… 3                 3    41     0    34     1   0.976 0.0143  0.971
##  5 175a… 3                 3    42     0    34     0   0.988 0.0143  0.971
##  6 1791… 1                 3    42     0    35     0   0.988 0.0143  1    
##  7 1919… 1                 3    42     0    35     0   0.988 0.0143  1    
##  8 1934… 2                 3    42     0    35     0   0.988 0.0143  1    
##  9 1b47… 3                 3    42     1    34     0   0.988 0.0286  0.971
## 10 1bee… 2                 3    42     0    35     0   0.988 0.0143  1    
## 11 1d48… 2                 3    42     0    35     0   0.988 0.0143  1    
## 12 1da7… 3                 3    42     1    34     0   0.988 0.0286  0.971
## 13 1df9… 2                 3    42     0    35     0   0.988 0.0143  1    
## 14 1e20… 1                 3    42     1    34     0   0.988 0.0286  0.971
## 15 1f94… 1                 3    42     0    35     0   0.988 0.0143  1    
## 16 c223… 2                 3    42     0    35     0   0.988 0.0143  1    
## # ... with 4 more variables: MissRate <dbl>, adjFARate <dbl>,
## #   adjHitRate <dbl>, d_p <dbl>

within a feature: levels

summarize = dplyr::summarize

basicLevel = totalData %>% filter(keys>=0)%>% group_by(subID,level_diff)%>% 
summarize(
            hit = sum(keys==1&SameDiff=="S"), 
            fa= sum(keys==1&SameDiff=="D"), 
            cr = sum(keys==0&SameDiff=="D"),
            miss = sum(keys==0&SameDiff=="S"),
            HitRate =sum(keys==1&SameDiff=="S")/sum(SameDiff=="S"), 
            FARate = sum(keys==1&SameDiff=="D")/sum(SameDiff=="D"), 
            CRRate = sum(keys==0&SameDiff=="D")/sum(SameDiff=="D"),
            MissRate = sum(keys==0&SameDiff=="S")/sum(SameDiff=="S"))

####
basicLevel_longform = gather(basicLevel, response_type, resp_rate, HitRate,FARate,CRRate,MissRate)
head(basicLevel_longform)
## # A tibble: 6 x 8
## # Groups:   subID [1]
##   subID level_diff   hit    fa    cr  miss response_type resp_rate
##   <fct> <fct>      <int> <int> <int> <int> <chr>             <dbl>
## 1 ""    0            100     0     0    19 HitRate           0.840
## 2 ""    1              0     7    13     0 HitRate         NaN    
## 3 ""    2              0     2    19     0 HitRate         NaN    
## 4 ""    3              0     1    19     0 HitRate         NaN    
## 5 ""    4              0     4    16     0 HitRate         NaN    
## 6 ""    5              0     1    19     0 HitRate         NaN
ggplot(basicLevel_longform, aes(x = response_type, y = resp_rate, color = response_type))+
  geom_boxplot(fill = "white",lwd = 1)+
  geom_jitter(width=0.2,alpha = 0.5)+
  facet_wrap(~level_diff,nrow = 1)+
  labs(title="Proportaion response type by difficulty level", x = "response type", y = "proportion")+
  theme_facet()
## Warning: Removed 792 rows containing non-finite values (stat_boxplot).
## Warning: Removed 792 rows containing missing values (geom_point).

###########
basicLevelFeature = totalData %>% filter(keys>=0)%>%group_by(subID,level_diff,feature_index)%>% 
summarize(  accuracy = sum(accuracy)/length(subID),
            hit = sum(keys==1&SameDiff=="S"), 
            fa= sum(keys==1&SameDiff=="D"), 
            cr = sum(keys==0&SameDiff=="D"),
            miss = sum(keys==0&SameDiff=="S"),
            HitRate =sum(keys==1&SameDiff=="S")/sum(SameDiff=="S"), 
            FARate = sum(keys==1&SameDiff=="D")/sum(SameDiff=="D"), 
            CRRate = sum(keys==0&SameDiff=="D")/sum(SameDiff=="D"),
            MissRate = sum(keys==0&SameDiff=="S")/sum(SameDiff=="S"))
  basicLevelFeature_longform = gather(basicLevelFeature, response_type, resp_rate, HitRate,FARate,CRRate,MissRate)
  head(basicLevelFeature_longform)
## # A tibble: 6 x 10
## # Groups:   subID, level_diff [6]
##   subID level_diff feature_index accuracy   hit    fa    cr  miss
##   <fct> <fct>      <fct>            <dbl> <int> <int> <int> <int>
## 1 ""    0          2                0.840   100     0     0    19
## 2 ""    1          2                0.65      0     7    13     0
## 3 ""    2          2                0.905     0     2    19     0
## 4 ""    3          2                0.95      0     1    19     0
## 5 ""    4          2                0.8       0     4    16     0
## 6 ""    5          2                0.95      0     1    19     0
## # ... with 2 more variables: response_type <chr>, resp_rate <dbl>
  ggplot(basicLevelFeature_longform, aes(x = response_type, y = resp_rate, color = response_type))+
    geom_boxplot(fill = "white",lwd = 1)+
    geom_jitter(width=0.2,alpha = 0.5)+
    facet_wrap(feature_index~level_diff,nrow = 3,strip.position = "bottom")+
    labs(title="Proportaion resp type within Feature by difficulty level", x = "level diff", y = "proportion")+
    theme_facet()
## Warning: Removed 792 rows containing non-finite values (stat_boxplot).

## Warning: Removed 792 rows containing missing values (geom_point).

  ggplot(basicLevelFeature, aes(x=feature_index,y=accuracy,color = feature_index))+
    geom_boxplot(fill = "white",lwd = 1)+
    geom_jitter(width=0.2,alpha = 0.5)+
    facet_wrap(~level_diff)+
    plotformat

  ggplot(basicLevelFeature, aes(x=level_diff,y=accuracy,color = feature_index))+
    geom_boxplot(fill = "white",lwd = 1)+
    plotformat

  ggplot(filter(basicLevelFeature,feature_index==1), aes(x=level_diff,y=accuracy))+
    geom_boxplot(fill = "white",lwd = 1)+
    geom_jitter(width=0.2,alpha = 0.5)+
    labs(title="Feauture 01", x = "level diff", y = "accuracy")+
    ylim(0,1)+
    plotformat
## Warning: Removed 21 rows containing missing values (geom_point).

  ggplot(filter(basicLevelFeature,feature_index==2), aes(x=level_diff,y=accuracy))+
    geom_boxplot(fill = "white",lwd = 1)+
    geom_jitter(width=0.2,alpha = 0.5)+
    labs(title="Feauture 02", x = "level diff", y = "accuracy")+
    ylim(0,1)+
    plotformat
## Warning: Removed 11 rows containing missing values (geom_point).

  ggplot(filter(basicLevelFeature,feature_index==3), aes(x=level_diff,y=accuracy))+
    geom_boxplot(fill = "white",lwd = 1)+
    geom_jitter(width=0.2,alpha = 0.5)+
    labs(title="Feauture 03", x = "level diff", y = "accuracy")+
    ylim(0,1)+
    plotformat
## Warning: Removed 9 rows containing missing values (geom_point).

stats

anovaFeatureLevel = aov(accuracy~feature_index+level_diff,basicLevelFeature)
summary(anovaFeatureLevel)
##                Df Sum Sq Mean Sq F value   Pr(>F)    
## feature_index   2  0.306  0.1532    7.87 0.000446 ***
## level_diff      5  5.296  1.0591   54.40  < 2e-16 ***
## Residuals     388  7.553  0.0195                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summarize = dplyr::summarize
compareFeatures = basicLevelFeature  %>%group_by(feature_index,level_diff) %>%summarize(meanAccuracy = mean(accuracy),std =sd(accuracy) )
compareFeatures
## # A tibble: 18 x 4
## # Groups:   feature_index [?]
##    feature_index level_diff meanAccuracy    std
##    <fct>         <fct>             <dbl>  <dbl>
##  1 1             0                 0.945 0.0502
##  2 1             1                 0.651 0.254 
##  3 1             2                 0.868 0.0895
##  4 1             3                 0.935 0.0678
##  5 1             4                 0.954 0.0560
##  6 1             5                 0.960 0.0504
##  7 2             0                 0.918 0.0680
##  8 2             1                 0.590 0.274 
##  9 2             2                 0.878 0.117 
## 10 2             3                 0.903 0.109 
## 11 2             4                 0.925 0.0851
## 12 2             5                 0.930 0.0924
## 13 3             0                 0.913 0.0878
## 14 3             1                 0.561 0.239 
## 15 3             2                 0.825 0.169 
## 16 3             3                 0.859 0.130 
## 17 3             4                 0.873 0.116 
## 18 3             5                 0.879 0.151

separate by run

basicLevelFeature = totalData %>% group_by(subID,level_diff,feature_index,run)%>% 
summarize(  accuracy = sum(accuracy)/length(subID),
            hit = sum(keys==1&SameDiff=="S"), 
            fa= sum(keys==1&SameDiff=="D"), 
            cr = sum(keys==0&SameDiff=="D"),
            miss = sum(keys==0&SameDiff=="S"),
            HitRate =sum(keys==1&SameDiff=="S")/sum(SameDiff=="S"), 
            FARate = sum(keys==1&SameDiff=="D")/sum(SameDiff=="D"), 
            CRRate = sum(keys==0&SameDiff=="D")/sum(SameDiff=="D"),
            MissRate = sum(keys==0&SameDiff=="S")/sum(SameDiff=="S"))  #filter(keys>=0)%>%
  basicLevelFeature_longform = gather(basicLevelFeature, response_type, resp_rate, HitRate,FARate,CRRate,MissRate)
  head(basicLevelFeature_longform)
## # A tibble: 6 x 11
## # Groups:   subID, level_diff, feature_index [2]
##   subID level_diff feature_index   run accuracy   hit    fa    cr  miss
##   <fct> <fct>      <fct>         <int>    <dbl> <int> <int> <int> <int>
## 1 ""    0          2                 1    0.929    39     0     0     3
## 2 ""    0          2                 2    0.738    33     0     0     7
## 3 ""    0          2                 3    0.548    28     0     0     9
## 4 ""    1          2                 1    0.857     0     1     6     0
## 5 ""    1          2                 2    0.857     0     1     6     0
## 6 ""    1          2                 3    0         0     5     1     0
## # ... with 2 more variables: response_type <chr>, resp_rate <dbl>
  ggplot(filter(basicLevelFeature,feature_index==1), aes(x=level_diff,y=accuracy,group = level_diff,color=subID))+
    geom_boxplot(fill = "white",lwd = 1)+
    geom_jitter(width=0.2,alpha = 0.5)+
    labs(title="Feauture 01", x = "level diff", y = "accuracy")+
    ylim(0,1)+
    facet_wrap(~run)+
    plotformat
## Warning: Removed 97 rows containing missing values (geom_point).

  ggplot(filter(basicLevelFeature,feature_index==2), aes(x=level_diff,y=accuracy,group = level_diff,color=subID))+
    geom_boxplot(fill = "white",lwd = 1)+
    geom_jitter(width=0.2,alpha = 0.5)+
    labs(title="Feauture 02", x = "level diff", y = "accuracy")+
    ylim(0,1)+
    facet_wrap(~run)+
    plotformat
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 75 rows containing missing values (geom_point).

  ggplot(filter(basicLevelFeature,feature_index==3), aes(x=level_diff,y=accuracy,group = level_diff,color=subID))+
    geom_boxplot(fill = "white",lwd = 1)+
    geom_jitter(width=0.2,alpha = 0.5)+
    labs(title="Feauture 03", x = "level diff", y = "accuracy")+
    ylim(0,1)+
    facet_wrap(~run)+
    plotformat
## Warning: Removed 102 rows containing missing values (geom_point).

stats

anovaFeatureLevel = aov(accuracy~feature_index+level_diff+run,basicLevelFeature)
summary(anovaFeatureLevel)
##                 Df Sum Sq Mean Sq F value  Pr(>F)    
## feature_index    2   0.58   0.291   6.927 0.00102 ** 
## level_diff       5  14.25   2.851  67.793 < 2e-16 ***
## run              1   3.54   3.540  84.190 < 2e-16 ***
## Residuals     1179  49.58   0.042                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anovaFeatureLevel3rdRun = aov(accuracy~feature_index+level_diff,filter(basicLevelFeature,run==3))
summary(anovaFeatureLevel3rdRun)
##                Df Sum Sq Mean Sq F value Pr(>F)    
## feature_index   2  0.219  0.1096   3.189 0.0423 *  
## level_diff      5  4.325  0.8650  25.160 <2e-16 ***
## Residuals     388 13.340  0.0344                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anovaFeatureLevel3rdRun_same = aov(accuracy~feature_index,filter(basicLevelFeature,run==3 & level_diff==0))
summary(anovaFeatureLevel3rdRun_same)
##               Df Sum Sq Mean Sq F value Pr(>F)
## feature_index  2 0.0502 0.02509   1.795  0.175
## Residuals     63 0.8806 0.01398
SameaccuracyF1 = filter(basicLevelFeature,feature_index==1,run==3 & level_diff==0)
SameaccuracyF2 = filter(basicLevelFeature,feature_index==2,run==3 & level_diff==0)
SameaccuracyF3 = filter(basicLevelFeature,feature_index==3,run==3 & level_diff==0)
t.test(SameaccuracyF1$accuracy,SameaccuracyF2$accuracy)
## 
##  Welch Two Sample t-test
## 
## data:  SameaccuracyF1$accuracy and SameaccuracyF2$accuracy
## t = 1.6226, df = 30.873, p-value = 0.1148
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.01545197  0.13563338
## sample estimates:
## mean of x mean of y 
## 0.9342404 0.8741497
t.test(SameaccuracyF1$accuracy,SameaccuracyF3$accuracy)
## 
##  Welch Two Sample t-test
## 
## data:  SameaccuracyF1$accuracy and SameaccuracyF3$accuracy
## t = 0.057834, df = 41.232, p-value = 0.9542
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.05767587  0.06107723
## sample estimates:
## mean of x mean of y 
## 0.9342404 0.9325397
t.test(SameaccuracyF2$accuracy,SameaccuracyF3$accuracy)
## 
##  Welch Two Sample t-test
## 
## data:  SameaccuracyF2$accuracy and SameaccuracyF3$accuracy
## t = -1.455, df = 37.44, p-value = 0.154
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.13966894  0.02288889
## sample estimates:
## mean of x mean of y 
## 0.8741497 0.9325397
anovaFeatureLevel3rdRun_l01 = aov(accuracy~feature_index,filter(basicLevelFeature,run==3 & level_diff==1))
summary(anovaFeatureLevel3rdRun_l01)
##               Df Sum Sq Mean Sq F value Pr(>F)
## feature_index  2  0.181 0.09063    0.86  0.428
## Residuals     63  6.642 0.10544
summarize = dplyr::summarize
compareFeatures = basicLevelFeature  %>% filter(run==3) %>% group_by(feature_index,level_diff) %>%summarize(meanAccuracy = mean(accuracy),std =sd(accuracy),med = median(accuracy) )
compareFeatures
## # A tibble: 18 x 5
## # Groups:   feature_index [?]
##    feature_index level_diff meanAccuracy    std   med
##    <fct>         <fct>             <dbl>  <dbl> <dbl>
##  1 1             0                 0.934 0.0811 0.952
##  2 1             1                 0.721 0.237  0.714
##  3 1             2                 0.891 0.180  1    
##  4 1             3                 0.966 0.0770 1    
##  5 1             4                 0.973 0.0731 1    
##  6 1             5                 0.925 0.154  1    
##  7 2             0                 0.874 0.149  0.929
##  8 2             1                 0.592 0.381  0.714
##  9 2             2                 0.884 0.179  1    
## 10 2             3                 0.905 0.138  1    
## 11 2             4                 0.884 0.205  1    
## 12 2             5                 0.932 0.133  1    
## 13 3             0                 0.933 0.115  0.976
## 14 3             1                 0.637 0.337  0.643
## 15 3             2                 0.881 0.244  1    
## 16 3             3                 0.958 0.107  1    
## 17 3             4                 0.964 0.0868 1    
## 18 3             5                 0.952 0.109  1